{% extends 'base.html' %} {% block page_content %}
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore') # silence chained-assignment / deprecation warnings notebook-wide
Importing the dataset:
# Load the Online News Popularity dataset (Mashable articles).
# NOTE(review): absolute Windows path — assumes the CSV sits in this local
# folder; adjust per machine.
path = "C:/Users/sarav/Desktop/A4/S7/Python/Project/"
df = pd.read_csv(path + "OnlineNewsPopularity.csv")
We can have a first glance at the dataset by using df.head() and df.info():
# First five rows, to eyeball the columns and value ranges.
df.head()
| url | timedelta | n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | ... | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | http://mashable.com/2013/01/07/amazon-instant-... | 731.0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | ... | 0.100000 | 0.7 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 | 593 |
| 1 | http://mashable.com/2013/01/07/ap-samsung-spon... | 731.0 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | ... | 0.033333 | 0.7 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 711 |
| 2 | http://mashable.com/2013/01/07/apple-40-billio... | 731.0 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | ... | 0.100000 | 1.0 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1500 |
| 3 | http://mashable.com/2013/01/07/astronaut-notre... | 731.0 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | ... | 0.136364 | 0.8 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1200 |
| 4 | http://mashable.com/2013/01/07/att-u-verse-apps/ | 731.0 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | ... | 0.033333 | 1.0 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 505 |
5 rows × 61 columns
# Column dtypes and non-null counts: 61 columns, no missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39644 entries, 0 to 39643 Data columns (total 61 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 39644 non-null object 1 timedelta 39644 non-null float64 2 n_tokens_title 39644 non-null float64 3 n_tokens_content 39644 non-null float64 4 n_unique_tokens 39644 non-null float64 5 n_non_stop_words 39644 non-null float64 6 n_non_stop_unique_tokens 39644 non-null float64 7 num_hrefs 39644 non-null float64 8 num_self_hrefs 39644 non-null float64 9 num_imgs 39644 non-null float64 10 num_videos 39644 non-null float64 11 average_token_length 39644 non-null float64 12 num_keywords 39644 non-null float64 13 data_channel_is_lifestyle 39644 non-null float64 14 data_channel_is_entertainment 39644 non-null float64 15 data_channel_is_bus 39644 non-null float64 16 data_channel_is_socmed 39644 non-null float64 17 data_channel_is_tech 39644 non-null float64 18 data_channel_is_world 39644 non-null float64 19 kw_min_min 39644 non-null float64 20 kw_max_min 39644 non-null float64 21 kw_avg_min 39644 non-null float64 22 kw_min_max 39644 non-null float64 23 kw_max_max 39644 non-null float64 24 kw_avg_max 39644 non-null float64 25 kw_min_avg 39644 non-null float64 26 kw_max_avg 39644 non-null float64 27 kw_avg_avg 39644 non-null float64 28 self_reference_min_shares 39644 non-null float64 29 self_reference_max_shares 39644 non-null float64 30 self_reference_avg_sharess 39644 non-null float64 31 weekday_is_monday 39644 non-null float64 32 weekday_is_tuesday 39644 non-null float64 33 weekday_is_wednesday 39644 non-null float64 34 weekday_is_thursday 39644 non-null float64 35 weekday_is_friday 39644 non-null float64 36 weekday_is_saturday 39644 non-null float64 37 weekday_is_sunday 39644 non-null float64 38 is_weekend 39644 non-null float64 39 LDA_00 39644 non-null float64 40 LDA_01 39644 non-null float64 41 LDA_02 39644 non-null float64 42 LDA_03 39644 non-null float64 43 LDA_04 39644 non-null float64 44 
global_subjectivity 39644 non-null float64 45 global_sentiment_polarity 39644 non-null float64 46 global_rate_positive_words 39644 non-null float64 47 global_rate_negative_words 39644 non-null float64 48 rate_positive_words 39644 non-null float64 49 rate_negative_words 39644 non-null float64 50 avg_positive_polarity 39644 non-null float64 51 min_positive_polarity 39644 non-null float64 52 max_positive_polarity 39644 non-null float64 53 avg_negative_polarity 39644 non-null float64 54 min_negative_polarity 39644 non-null float64 55 max_negative_polarity 39644 non-null float64 56 title_subjectivity 39644 non-null float64 57 title_sentiment_polarity 39644 non-null float64 58 abs_title_subjectivity 39644 non-null float64 59 abs_title_sentiment_polarity 39644 non-null float64 60 shares 39644 non-null int64 dtypes: float64(59), int64(1), object(1) memory usage: 18.5+ MB
We can see that the dataset is already clean: there are no NaN values, and all features are floats (apart from the URL).
# Temporarily raise the column-display limit so describe() prints every feature.
with pd.option_context('display.max_columns', 60):
    print(df.describe(include = "all")) # to see all columns
url timedelta \
count 39644 39644.000000
unique 39644 NaN
top http://mashable.com/2013/05/23/college-basebal... NaN
freq 1 NaN
mean NaN 354.530471
std NaN 214.163767
min NaN 8.000000
25% NaN 164.000000
50% NaN 339.000000
75% NaN 542.000000
max NaN 731.000000
n_tokens_title n_tokens_content n_unique_tokens \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 10.398749 546.514731 0.548216
std 2.114037 471.107508 3.520708
min 2.000000 0.000000 0.000000
25% 9.000000 246.000000 0.470870
50% 10.000000 409.000000 0.539226
75% 12.000000 716.000000 0.608696
max 23.000000 8474.000000 701.000000
n_non_stop_words n_non_stop_unique_tokens num_hrefs \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.996469 0.689175 10.883690
std 5.231231 3.264816 11.332017
min 0.000000 0.000000 0.000000
25% 1.000000 0.625739 4.000000
50% 1.000000 0.690476 8.000000
75% 1.000000 0.754630 14.000000
max 1042.000000 650.000000 304.000000
num_self_hrefs num_imgs num_videos average_token_length \
count 39644.000000 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 3.293638 4.544143 1.249874 4.548239
std 3.855141 8.309434 4.107855 0.844406
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 1.000000 0.000000 4.478404
50% 3.000000 1.000000 0.000000 4.664082
75% 4.000000 4.000000 1.000000 4.854839
max 116.000000 128.000000 91.000000 8.041534
num_keywords data_channel_is_lifestyle \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 7.223767 0.052946
std 1.909130 0.223929
min 1.000000 0.000000
25% 6.000000 0.000000
50% 7.000000 0.000000
75% 9.000000 0.000000
max 10.000000 1.000000
data_channel_is_entertainment data_channel_is_bus \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.178009 0.157855
std 0.382525 0.364610
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
data_channel_is_socmed data_channel_is_tech \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.058597 0.185299
std 0.234871 0.388545
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
data_channel_is_world kw_min_min kw_max_min kw_avg_min \
count 39644.000000 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 0.212567 26.106801 1153.951682 312.366967
std 0.409129 69.633215 3857.990877 620.783887
min 0.000000 -1.000000 0.000000 -1.000000
25% 0.000000 -1.000000 445.000000 141.750000
50% 0.000000 -1.000000 660.000000 235.500000
75% 0.000000 4.000000 1000.000000 357.000000
max 1.000000 377.000000 298400.000000 42827.857143
kw_min_max kw_max_max kw_avg_max kw_min_avg \
count 39644.000000 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 13612.354102 752324.066694 259281.938083 1117.146610
std 57986.029357 214502.129573 135102.247285 1137.456951
min 0.000000 0.000000 0.000000 -1.000000
25% 0.000000 843300.000000 172846.875000 0.000000
50% 1400.000000 843300.000000 244572.222223 1023.635611
75% 7900.000000 843300.000000 330980.000000 2056.781032
max 843300.000000 843300.000000 843300.000000 3613.039819
kw_max_avg kw_avg_avg self_reference_min_shares \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 5657.211151 3135.858639 3998.755396
std 6098.871957 1318.150397 19738.670516
min 0.000000 0.000000 0.000000
25% 3562.101631 2382.448566 639.000000
50% 4355.688836 2870.074878 1200.000000
75% 6019.953968 3600.229564 2600.000000
max 298400.000000 43567.659946 843300.000000
self_reference_max_shares ... weekday_is_monday \
count 39644.000000 ... 39644.000000
unique NaN ... NaN
top NaN ... NaN
freq NaN ... NaN
mean 10329.212662 ... 0.168020
std 41027.576613 ... 0.373889
min 0.000000 ... 0.000000
25% 1100.000000 ... 0.000000
50% 2800.000000 ... 0.000000
75% 8000.000000 ... 0.000000
max 843300.000000 ... 1.000000
weekday_is_tuesday weekday_is_wednesday weekday_is_thursday \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.186409 0.187544 0.183306
std 0.389441 0.390353 0.386922
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000
weekday_is_friday weekday_is_saturday weekday_is_sunday \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.143805 0.061876 0.069039
std 0.350896 0.240933 0.253524
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000
is_weekend LDA_00 LDA_01 LDA_02 LDA_03 \
count 39644.000000 39644.000000 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 0.130915 0.184599 0.141256 0.216321 0.223770
std 0.337312 0.262975 0.219707 0.282145 0.295191
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.025051 0.025012 0.028571 0.028571
50% 0.000000 0.033387 0.033345 0.040004 0.040001
75% 0.000000 0.240958 0.150831 0.334218 0.375763
max 1.000000 0.926994 0.925947 0.919999 0.926534
LDA_04 global_subjectivity global_sentiment_polarity \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.234029 0.443370 0.119309
std 0.289183 0.116685 0.096931
min 0.000000 0.000000 -0.393750
25% 0.028574 0.396167 0.057757
50% 0.040727 0.453457 0.119117
75% 0.399986 0.508333 0.177832
max 0.927191 1.000000 0.727841
global_rate_positive_words global_rate_negative_words \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.039625 0.016612
std 0.017429 0.010828
min 0.000000 0.000000
25% 0.028384 0.009615
50% 0.039023 0.015337
75% 0.050279 0.021739
max 0.155488 0.184932
rate_positive_words rate_negative_words avg_positive_polarity \
count 39644.000000 39644.000000 39644.000000
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.682150 0.287934 0.353825
std 0.190206 0.156156 0.104542
min 0.000000 0.000000 0.000000
25% 0.600000 0.185185 0.306244
50% 0.710526 0.280000 0.358755
75% 0.800000 0.384615 0.411428
max 1.000000 1.000000 1.000000
min_positive_polarity max_positive_polarity \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.095446 0.756728
std 0.071315 0.247786
min 0.000000 0.000000
25% 0.050000 0.600000
50% 0.100000 0.800000
75% 0.100000 1.000000
max 1.000000 1.000000
avg_negative_polarity min_negative_polarity \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean -0.259524 -0.521944
std 0.127726 0.290290
min -1.000000 -1.000000
25% -0.328383 -0.700000
50% -0.253333 -0.500000
75% -0.186905 -0.300000
max 0.000000 0.000000
max_negative_polarity title_subjectivity \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean -0.107500 0.282353
std 0.095373 0.324247
min -1.000000 0.000000
25% -0.125000 0.000000
50% -0.100000 0.150000
75% -0.050000 0.500000
max 0.000000 1.000000
title_sentiment_polarity abs_title_subjectivity \
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.071425 0.341843
std 0.265450 0.188791
min -1.000000 0.000000
25% 0.000000 0.166667
50% 0.000000 0.500000
75% 0.150000 0.500000
max 1.000000 0.500000
abs_title_sentiment_polarity shares
count 39644.000000 39644.000000
unique NaN NaN
top NaN NaN
freq NaN NaN
mean 0.156064 3395.380184
std 0.226294 11626.950749
min 0.000000 1.000000
25% 0.000000 946.000000
50% 0.000000 1400.000000
75% 0.250000 2800.000000
max 1.000000 843300.000000
[11 rows x 61 columns]
Selection of the columns to be scaled depending on the variable type :
# Partition columns into binary dummies (left unscaled) and continuous
# features (robust-scaled later).
#
# Bug fix: the original test `max(col) == 1 and min(col) == 0` also catches
# *continuous* columns whose observed range happens to span exactly [0, 1]
# (e.g. " rate_positive_words", " global_subjectivity"), wrongly leaving
# them unscaled. Checking that a column only ever takes values from {0, 1}
# identifies the true dummy variables.
notToBeScaled = []
toBeScaled = []
for col in df.columns:
    if set(df[col].unique()) <= {0, 1}:
        notToBeScaled.append(col)
    else:
        toBeScaled.append(col)
# Identifier and target-like columns are never scaled (note the leading
# spaces in the original CSV header names).
toBeScaled.remove("url")
toBeScaled.remove(" timedelta")
toBeScaled.remove(" shares")
# Distribution of the target (column 60 = " shares"): heavily right-skewed.
px.histogram(df.iloc[:,60] )
We can observe here that the distribution of the number of shares has some outliers (probably articles that went viral). This could impact the performance of our models. We could perhaps remove the outliers (e.g. articles that have more than 15K shares).
# Quick visual scan: histograms of the first 25 numeric features
# (columns 1..25, skipping the url column) on a 5x5 grid, without ticks.
plt.figure(figsize=(10, 10))
for plot_idx in range(1, 26):
    column = df.iloc[:, plot_idx]
    plt.subplot(5, 5, plot_idx)
    plt.xticks([])
    plt.yticks([])
    plt.hist(column)
    plt.xlabel(column.name)
plt.show()
Many features have outliers. Using robust scaling (based on medians) might do the trick
# Pearson correlation heatmap over the numeric features.
# Fix: df contains the non-numeric "url" column; pandas >= 2.0 raises on
# DataFrame.corr() with object columns instead of silently dropping them,
# so select the numeric columns explicitly.
plt.figure(figsize=(50,40))
cor = df.select_dtypes(include=np.number).corr(method='pearson')
sns.heatmap(cor, cmap="bwr")
plt.show()
With the correlation matrix we observe that some group of features have a strong correlation with one-another, but few correlation with the rest of the dataset (like the week days). On the other hand the shares have very poor correlation with any other features.
# Repeat the correlation analysis with a log-transformed target:
# log(shares) compresses the viral outliers and shows more contrast.
dflog = df.copy()
dflog.loc[:," shares"] = np.log(df.loc[:," shares"])  # shares >= 1, so log is well-defined
plt.figure(figsize=(40,30))
# Fix: pandas >= 2.0 raises on DataFrame.corr() with the non-numeric "url"
# column present; select the numeric columns explicitly.
cor = dflog.select_dtypes(include=np.number).corr(method='pearson')
sns.heatmap(cor, cmap="bwr")
plt.show()
When applying the log function to the shares, we can see a bit more contrasted correlation for some features. We shall then try to use the log(shares) feature as it might improve our scores.
In terms of feature engineering, we will, for the moment try to use all features, and then we will see if some of them are irrelevant and should be removed.
So let's split the data :
# Features: every column except the identifier, the target, and timedelta.
X = df.drop(labels=["url", " shares", " timedelta"], axis=1)
# Target: raw share counts.
y = df[" shares"]
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
Now let's scale our data using robust scale so that values on different scales can be compared:
# Robust scaling (median / IQR) resists the many outliers observed above.
scaler = RobustScaler()
scaler.fit(X_train.loc[:,toBeScaled]) #We fit only on the training data, to avoid test-set leakage
X_train.loc[:,toBeScaled] = scaler.transform(X_train.loc[:,toBeScaled])
X_test.loc[:,toBeScaled] = scaler.transform(X_test.loc[:,toBeScaled])
The objective here is to predict a quantity (the number of shares an article gets). We have less than 100K examples, and we have many features that might be important. We shall thus begin to try two models:
# Baseline model: L2-regularized linear regression.
from sklearn.linear_model import Ridge
model = Ridge(alpha = 1)
model.fit(X_train, y_train)
# R^2 on the held-out test set.
model.score(X_test, y_test)
0.03482004060582422
from sklearn.model_selection import validation_curve
# Sweep the regularization strength over 13 decades.
alpha = np.logspace(-6, 6, 13)
# Fix: param_name and param_range are keyword-only in modern scikit-learn;
# passing them positionally ("alpha", alpha) raises a TypeError.
train_score, val_score = validation_curve(
    model, X_train, y_train, param_name="alpha", param_range=alpha, cv=5
)
# Mean cross-validated score for each alpha.
plt.plot(val_score.mean(axis=1))
[<matplotlib.lines.Line2D at 0x1725913d130>]
from sklearn.ensemble import RandomForestRegressor
# Fix: max_features='auto' was deprecated (sklearn 1.1) and removed (1.3)
# for regressors; max_features=1.0 is the explicit equivalent (all features
# considered at each split).
model = RandomForestRegressor(n_estimators=300, max_depth=10, max_features=1.0)
model.fit(X_train, y_train)
# R^2 on the held-out test set.
model.score(X_test, y_test)
-0.09356212338974856
from sklearn import metrics
def Metric(y_test, predicted_test):
    """Print MAE, MSE and RMSE for a set of regression predictions."""
    mae = metrics.mean_absolute_error(y_test, predicted_test)
    # Compute the squared error once and derive RMSE from it
    # (the original called mean_squared_error twice).
    mse = metrics.mean_squared_error(y_test, predicted_test)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
predicted_test = model.predict(X_test)
Metric(y_test, predicted_test)
Mean Absolute Error: 3096.6278505513305 Mean Squared Error: 70353598.4907881 Root Mean Squared Error: 8387.705198133044
At this point the results are pretty terrible: on average, the model prediction makes an error of roughly 8,388 shares (the RMSE above).
We will try to reduce the number of features and apply the log function to the target feature.
# Keep only the features with the strongest correlation to log(shares),
# per the correlation matrix above, plus the target.
dflog = dflog.loc[:,[" LDA_02"," is_weekend", " kw_avg_avg"," data_channel_is_world", " average_token_length", " rate_negative_words"," shares"]]
We selected the features that had the strongest correlation according to the correlation matrix.
# Re-split on the reduced feature set; the target is now log(shares).
X = dflog.drop(" shares", axis = 1)
y = dflog[" shares"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 101) # 20 % of test data
# Fix: max_features='auto' was removed from RandomForestRegressor in
# scikit-learn 1.3; max_features=1.0 is the explicit equivalent.
model = RandomForestRegressor(n_estimators=300, max_depth=10, max_features=1.0)
model.fit(X_train, y_train)
# R^2 on the held-out test set.
model.score(X_test, y_test)
0.08974408527583078
predicted_test = model.predict(X_test)
# Consistency: reuse the Metric() helper defined earlier instead of
# duplicating the same three print statements inline.
Metric(y_test, predicted_test)
Mean Absolute Error: 0.6645050829119794 Mean Squared Error: 0.7938541987827277 Root Mean Squared Error: 0.8909849599082622
Even though the mean errors seem smaller when we apply the log function on the target feature, this reduction is due to the fact that the scale of the data in that feature is then much smaller (as it has the log function applied), thus the mean error is smaller as well, but the regression is not necessarily more accurate.
The results we get are terrible, there are not enough linear or logarithmic relations between the features to apply a regression
After reading the paper published by the creators of the dataset, we realized that they were actually applying classification algorithms. The target feature is numerical, but they set a threshold, where every article having more shares than that threshold would be considered "popular", and other articles would be considered "not popular". That way we would have only two different values for that feature, making it categorical.
# Where does the mean sit in the shares distribution? Close to the 80th percentile.
df[" shares"].describe(percentiles = [.50, .80, .85, .90])
count 39644.000000 mean 3395.380184 std 11626.950749 min 1.000000 50% 1400.000000 80% 3400.000000 85% 4400.000000 90% 6200.000000 max 843300.000000 Name: shares, dtype: float64
The mean value is close to the 80th percentile value. Thus if a number of share is above average, it means that it is better than 80% of the articles of the dataset.
We will then set our threshold to 3395 to sort the articles between not popular (0) and popular (1)
# Binarize the target: an article is "popular" (1) when its share count
# exceeds the dataset mean (~3395.38, close to the 80th percentile).
# Generalization: derive the threshold from the data instead of hard-coding
# 3395 — since share counts are integers, `shares > 3395.38` and
# `shares > 3395` label every row identically.
popularity_threshold = df[" shares"].mean()
df["popular"] = np.where(df[" shares"]>popularity_threshold,1,0)
df.head()
| url | timedelta | n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | ... | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | popular | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | http://mashable.com/2013/01/07/amazon-instant-... | 731.0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | ... | 0.7 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 | 593 | 0 |
| 1 | http://mashable.com/2013/01/07/ap-samsung-spon... | 731.0 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | ... | 0.7 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 711 | 0 |
| 2 | http://mashable.com/2013/01/07/apple-40-billio... | 731.0 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | ... | 1.0 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1500 | 0 |
| 3 | http://mashable.com/2013/01/07/astronaut-notre... | 731.0 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | ... | 0.8 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1200 | 0 |
| 4 | http://mashable.com/2013/01/07/att-u-verse-apps/ | 731.0 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | ... | 1.0 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 505 | 0 |
5 rows × 62 columns
# Classification setup: same predictors as before, binary "popular" target.
X = df.drop(labels=[" shares", "url", " timedelta", "popular"], axis=1)
y = df["popular"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Robust-scale the continuous columns, fitting on the training split only
# so that no test-set information leaks into the scaler.
scaler = RobustScaler()
scaler.fit(X_train.loc[:, toBeScaled])
X_train.loc[:, toBeScaled] = scaler.transform(X_train.loc[:, toBeScaled])
X_test.loc[:, toBeScaled] = scaler.transform(X_test.loc[:, toBeScaled])
We will try four different classification algorithms and see which one classifies the data best:
# Evaluation utilities shared by all four candidate classifiers.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import pickle
from sklearn.ensemble import RandomForestClassifier
# Candidate 1: random forest with default hyperparameters.
clf_rforest = RandomForestClassifier()
clf_rforest.fit(X_train, y_train)
RandomForestClassifier()
Using only accuracy to evaluate the performance of our model would not be clever, as the split between the popular and not popular articles is 20%/80%. Thus we will need other metrics to evaluate it:
# Per-class precision/recall/F1 — accuracy alone is misleading on this ~80/20 class split.
y_pred_rforest = clf_rforest.predict(X_test)
print(classification_report(y_test, y_pred_rforest, digits = 4))
precision recall f1-score support
0 0.8034 0.9858 0.8853 6319
1 0.4886 0.0534 0.0963 1610
accuracy 0.7964 7929
macro avg 0.6460 0.5196 0.4908 7929
weighted avg 0.7395 0.7964 0.7251 7929
def ConfusionMatrix(cfMatrix):
    """Plot a confusion matrix as an annotated heatmap.

    Fix: without fmt="d", seaborn renders the large integer cell counts
    in scientific notation, which is unreadable for a confusion matrix.
    """
    sns.heatmap(cfMatrix, annot=True, fmt="d", cmap='Reds')
ConfusionMatrix(confusion_matrix(y_test, clf_rforest.predict(X_test)))
# ROC analysis for the random forest: positive-class probabilities -> AUROC + curve.
rf_probs = clf_rforest.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)
print("RandomForest : AUROC = %.3f" % (rf_auc))
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
plt.plot(rf_fpr, rf_tpr, marker=".", label="RandomForest (AUROC = %0.3f)" % rf_auc)
RandomForest : AUROC = 0.700
[<matplotlib.lines.Line2D at 0x17258eba640>]
We could be satisfied with an AUC score of 70%, but we will see if other models perform better:
# Candidate 2: AdaBoost with default hyperparameters.
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier()
clf_ada.fit(X_train, y_train)
AdaBoostClassifier()
# Per-class precision/recall/F1 for AdaBoost.
y_pred_ada = clf_ada.predict(X_test)
print(classification_report(y_test, y_pred_ada, digits = 4))
precision recall f1-score support
0 0.8057 0.9805 0.8846 6319
1 0.4854 0.0720 0.1255 1610
accuracy 0.7961 7929
macro avg 0.6455 0.5263 0.5050 7929
weighted avg 0.7407 0.7961 0.7304 7929
# Confusion matrix and ROC analysis for AdaBoost.
ConfusionMatrix(confusion_matrix(y_test, clf_ada.predict(X_test)))
ada_probs = clf_ada.predict_proba(X_test)[:, 1]
ada_auc = roc_auc_score(y_test, ada_probs)
print("AdaBoost : AUROC = %.3f" % (ada_auc))
ada_fpr, ada_tpr, _ = roc_curve(y_test, ada_probs)
plt.plot(ada_fpr, ada_tpr, marker=".", label="AdaBoost (AUROC = %0.3f)" % ada_auc)
AdaBoost : AUROC = 0.705
[<matplotlib.lines.Line2D at 0x17258b65790>]
# Candidate 3: k-nearest neighbours with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)
KNeighborsClassifier()
# Per-class precision/recall/F1 for KNN.
y_pred_knn = clf_knn.predict(X_test)
print(classification_report(y_test, y_pred_knn, digits = 4))
precision recall f1-score support
0 0.8130 0.9416 0.8726 6319
1 0.3951 0.1497 0.2171 1610
accuracy 0.7808 7929
macro avg 0.6040 0.5456 0.5448 7929
weighted avg 0.7281 0.7808 0.7395 7929
# Confusion matrix and ROC analysis for KNN.
ConfusionMatrix(confusion_matrix(y_test, clf_knn.predict(X_test)))
knn_probs = clf_knn.predict_proba(X_test)[:, 1]
knn_auc = roc_auc_score(y_test, knn_probs)
print("KNN : AUROC = %.3f" % (knn_auc))
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_probs)
plt.plot(knn_fpr, knn_tpr, marker=".", label="KNN (AUROC = %0.3f)" % knn_auc)
KNN : AUROC = 0.621
[<matplotlib.lines.Line2D at 0x172591d19d0>]
# Candidate 4: Gaussian naive Bayes.
from sklearn.naive_bayes import GaussianNB
clf_GNB = GaussianNB()
clf_GNB.fit(X_train, y_train)
GaussianNB()
# Per-class precision/recall/F1 for Gaussian naive Bayes.
y_pred_GNB = clf_GNB.predict(X_test)
print(classification_report(y_test, y_pred_GNB, digits = 4))
precision recall f1-score support
0 0.7980 0.9756 0.8779 6319
1 0.2414 0.0304 0.0541 1610
accuracy 0.7837 7929
macro avg 0.5197 0.5030 0.4660 7929
weighted avg 0.6849 0.7837 0.7106 7929
# Confusion matrix and ROC analysis for Gaussian naive Bayes.
ConfusionMatrix(confusion_matrix(y_test, clf_GNB.predict(X_test)))
GNB_probs = clf_GNB.predict_proba(X_test)[:, 1]
GNB_auc = roc_auc_score(y_test, GNB_probs)
print("GNB : AUROC = %.3f" % (GNB_auc))
GNB_fpr, GNB_tpr, _ = roc_curve(y_test, GNB_probs)
plt.plot(GNB_fpr, GNB_tpr, marker=".", label="GNB (AUROC = %0.3f)" % GNB_auc)
GNB : AUROC = 0.579
[<matplotlib.lines.Line2D at 0x17258d33e50>]
Now that we have tested each model, let us compare the different scores we got:
# Collect every model's AUC into a small comparison table.
auc_by_model = {
    "Random Forest": rf_auc,
    "Adaptive Boosting": ada_auc,
    "KNN": knn_auc,
    "Naive Bayes": GNB_auc,
}
model = list(auc_by_model.keys())
score = list(auc_by_model.values())
scores = pd.DataFrame({"Model": model, "Score AUC": score})
scores
| Model | Score AUC | |
|---|---|---|
| 0 | Random Forest | 0.700291 |
| 1 | Adaptive Boosting | 0.704720 |
| 2 | KNN | 0.620676 |
| 3 | Naive Bayes | 0.579315 |
# Bar chart comparing the AUC scores of the four models.
fig = px.bar(scores, x='Model', y='Score AUC')
fig.show()
It is then the Adaboost that gave us the best AUC score. Let us then try to optimize it:
To improve our model we will tune it with a GridSearch:
from sklearn.model_selection import GridSearchCV
It will test each parameter of the list to find which one gives the best score.
# Tune the number of boosting rounds, selecting by 5-fold cross-validated ROC AUC.
params = {'n_estimators' : [50, 100, 200, 300]}
grid_ada = GridSearchCV(clf_ada, params, cv = 5,scoring='roc_auc')
grid_ada.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
param_grid={'n_estimators': [50, 100, 200, 300]},
scoring='roc_auc')
# Best estimator found by the grid search and its mean cross-validated AUC.
grid_ada.best_estimator_, grid_ada.best_score_
(AdaBoostClassifier(n_estimators=100), 0.7028255256635939)
# NOTE(review): classifier .score() is mean accuracy here, not AUC.
grid_ada.best_estimator_.score(X_test, y_test)
0.7972001513431707
# Final model: confusion matrix and ROC analysis for the tuned AdaBoost.
best_ada = grid_ada.best_estimator_
ConfusionMatrix(confusion_matrix(y_test, best_ada.predict(X_test)))
grid_ada_probs = best_ada.predict_proba(X_test)[:, 1]
grid_ada_auc = roc_auc_score(y_test, grid_ada_probs)
print("Final Model : AUROC = %.3f" % (grid_ada_auc))
grid_ada_fpr, grid_ada_tpr, _ = roc_curve(y_test, grid_ada_probs)
plt.plot(grid_ada_fpr, grid_ada_tpr, marker=".", label="Final Model (AUROC = %0.3f)" % grid_ada_auc)
Final Model : AUROC = 0.706
[<matplotlib.lines.Line2D at 0x172590f7610>]
Despite struggling at the beginning with the regression, we eventually managed to obtain a reasonable score with our improved AdaBoost model, thanks to scaling, model selection, and hyperparameter optimization with a grid search.